{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b2bd82d7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Python 3.12.1\n"
     ]
    }
   ],
   "source": [
    "!python -V"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "41062d8a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c984c564",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "8b135c2b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction import DictVectorizer\n",
    "from sklearn.metrics import root_mean_squared_error"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "1464985f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/05/22 12:10:31 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-experiment' does not exist. Creating a new experiment.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1747915831003, experiment_id='1', last_update_time=1747915831003, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import mlflow\n",
    "\n",
    "mlflow.set_tracking_uri(\"http://localhost:5000\")\n",
    "mlflow.set_experiment(\"nyc-taxi-experiment\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "f9e6479e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_dataframe(filename):\n",
    "    df = pd.read_parquet(filename)\n",
    "\n",
    "    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime\n",
    "    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)\n",
    "\n",
    "    df = df[(df.duration >= 1) & (df.duration <= 60)]\n",
    "\n",
    "    categorical = ['PULocationID', 'DOLocationID']\n",
    "    df[categorical] = df[categorical].astype(str)\n",
    "\n",
    "    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']\n",
    "\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "8029eba0",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet')\n",
    "df_val = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "c5cbfc25",
   "metadata": {},
   "outputs": [],
   "source": [
    "categorical = ['PU_DO'] #'PULocationID', 'DOLocationID']\n",
    "numerical = ['trip_distance']\n",
    "\n",
    "dv = DictVectorizer()\n",
    "\n",
    "train_dicts = df_train[categorical + numerical].to_dict(orient='records')\n",
    "X_train = dv.fit_transform(train_dicts)\n",
    "\n",
    "val_dicts = df_val[categorical + numerical].to_dict(orient='records')\n",
    "X_val = dv.transform(val_dicts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "1e9fb68b",
   "metadata": {},
   "outputs": [],
   "source": [
    "target = 'duration'\n",
    "y_train = df_train[target].values\n",
    "y_val = df_val[target].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "5f56e97b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import xgboost as xgb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "b1f71259-0e96-4725-9151-dc274f4e984c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "0ee76202-3ff6-4bd7-b70e-b8d1c87c26d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "models_folder = Path('models')\n",
    "models_folder.mkdir(exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "0e8cd729",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/python/3.12.1/lib/python3.12/site-packages/xgboost/callback.py:386: UserWarning: [12:17:00] WARNING: /workspace/src/objective/regression_obj.cu:250: reg:linear is now deprecated in favor of reg:squarederror.\n",
      "  self.starting_round = model.num_boosted_rounds()\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0]\tvalidation-rmse:11.44482\n",
      "[1]\tvalidation-rmse:10.77202\n",
      "[2]\tvalidation-rmse:10.18363\n",
      "[3]\tvalidation-rmse:9.67396\n",
      "[4]\tvalidation-rmse:9.23166\n",
      "[5]\tvalidation-rmse:8.84808\n",
      "[6]\tvalidation-rmse:8.51883\n",
      "[7]\tvalidation-rmse:8.23597\n",
      "[8]\tvalidation-rmse:7.99320\n",
      "[9]\tvalidation-rmse:7.78709\n",
      "[10]\tvalidation-rmse:7.61022\n",
      "[11]\tvalidation-rmse:7.45952\n",
      "[12]\tvalidation-rmse:7.33049\n",
      "[13]\tvalidation-rmse:7.22098\n",
      "[14]\tvalidation-rmse:7.12713\n",
      "[15]\tvalidation-rmse:7.04752\n",
      "[16]\tvalidation-rmse:6.98005\n",
      "[17]\tvalidation-rmse:6.92232\n",
      "[18]\tvalidation-rmse:6.87112\n",
      "[19]\tvalidation-rmse:6.82740\n",
      "[20]\tvalidation-rmse:6.78995\n",
      "[21]\tvalidation-rmse:6.75792\n",
      "[22]\tvalidation-rmse:6.72994\n",
      "[23]\tvalidation-rmse:6.70547\n",
      "[24]\tvalidation-rmse:6.68390\n",
      "[25]\tvalidation-rmse:6.66421\n",
      "[26]\tvalidation-rmse:6.64806\n",
      "[27]\tvalidation-rmse:6.63280\n",
      "[28]\tvalidation-rmse:6.61924\n",
      "[29]\tvalidation-rmse:6.60773\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/python/3.12.1/lib/python3.12/site-packages/mlflow/xgboost/__init__.py:168: UserWarning: [12:17:36] WARNING: /workspace/src/c_api/c_api.cc:1427: Saving model in the UBJSON format as default.  You can use file extension: `json`, `ubj` or `deprecated` to choose between formats.\n",
      "  xgb_model.save_model(model_data_path)\n",
      "\u001b[31m2025/05/22 12:17:41 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\u001b[0m\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "🏃 View run stylish-auk-139 at: http://localhost:5000/#/experiments/1/runs/494792a9a00b48b1b1ef16a3dd8aeebe\n",
      "🧪 View experiment at: http://localhost:5000/#/experiments/1\n"
     ]
    }
   ],
   "source": [
    "with mlflow.start_run():\n",
    "    train = xgb.DMatrix(X_train, label=y_train)\n",
    "    valid = xgb.DMatrix(X_val, label=y_val)\n",
    "\n",
    "    best_params = {\n",
    "        'learning_rate': 0.09585355369315604,\n",
    "        'max_depth': 30,\n",
    "        'min_child_weight': 1.060597050922164,\n",
    "        'objective': 'reg:linear',\n",
    "        'reg_alpha': 0.018060244040060163,\n",
    "        'reg_lambda': 0.011658731377413597,\n",
    "        'seed': 42\n",
    "    }\n",
    "\n",
    "    mlflow.log_params(best_params)\n",
    "\n",
    "    booster = xgb.train(\n",
    "        params=best_params,\n",
    "        dtrain=train,\n",
    "        num_boost_round=30,\n",
    "        evals=[(valid, 'validation')],\n",
    "        early_stopping_rounds=50\n",
    "    )\n",
    "\n",
    "    y_pred = booster.predict(valid)\n",
    "    rmse = root_mean_squared_error(y_val, y_pred)\n",
    "    mlflow.log_metric(\"rmse\", rmse)\n",
    "\n",
    "    with open(\"models/preprocessor.b\", \"wb\") as f_out:\n",
    "        pickle.dump(dv, f_out)\n",
    "    mlflow.log_artifact(\"models/preprocessor.b\", artifact_path=\"preprocessor\")\n",
    "\n",
    "    mlflow.xgboost.log_model(booster, artifact_path=\"models_mlflow\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2108f4b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "0848c9d6c7d415ad6c477ff7ff8e98694d1a4aa96d0deee89244642e6b630036"
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
